home *** CD-ROM | disk | FTP | other *** search
- /* Copyright © 1995 Gustav Larsson */
-
- /* ------------------------------------------------------ */
- /* Constants & Types */
-
- #define ALPHABET_SIZE 256
-
- #define ALLOC_SIZE(n) ((n+3) & -4L) /* next multiple of 4 */
-
- #define HASH_BUCKETS 1024 /* must be power of 2 */
- #define HASH_MASK (HASH_BUCKETS - 1)
-
- #define NO_NULL_CHAR 'A'
-
- #define NULL 0
- //#define TRUE 1
- //#define FALSE 0
-
- typedef unsigned char uchar;
- typedef unsigned short ushort;
- typedef unsigned long ulong;
-
- typedef struct Word Word;
- typedef struct Occurrence Occurrence;
- typedef struct Private Private;
-
- /*
- A block of occurrence positions. We pack in as many
- occurrences as possible into a single block, from 3 to 6
- depending on textLength.
-
- The first entry in the block is always used. The
- remaining entries are in use if they are not zero.
- These facts are used several places to simplify the code.
- */
-
- struct Occurrence {
- Occurrence *next;
- union {
- ushort pos2[6]; /* 2 bytes/occurrence */
- struct {
- ushort lo[4];
- uchar hi[4];
- } pos3; /* 3 bytes/occurrence */
- long pos4[3]; /* 4 bytes/occurrence */
- } p;
- };
-
- /*
- There is one Word struct for each distince word.
- The word's length is stored in the top eight bits
- of the hash value. There's no need to store the
- characters in the word since we can just look at
- the first occurrence (first entry in Word.first).
- */
-
- struct Word {
- Word *next;
- ulong hash;
- Occurrence *last;
- Occurrence first;
- };
-
- /*
- The structure of our private storage. The hashCodes[]
- array serves two purposes: it distinguishes alphanumeric
- from non-alphanumeric characters, and it provides a
- non-zero hash code for each alphanumeric character.
- The endParsedText field will be -1 if there was enough
- private memory to parse all the text. Otherwise, it points
- to the start of the unparsed text. nullChar is used by
- the BMH_Search() function when we must search unparsed
- text for an occurrence.
- */
-
- struct Private {
- ulong hashCodes [ ALPHABET_SIZE ];
- Word *hashTable [ HASH_BUCKETS ];
- long endParsedText; /* start of parsed text */
- long posBytes; /* POS_x_BYTES, below */
- char nullChar; /* char not appearing in the text */
- long heap; /* start of private heap */
- };
-
- /* ------------------------------------------------------ */
- /* Macros */
-
- /*
- These macros simplify access to the occurrence positions
- stored in an Occurrence struct. Posbytes is a macro
- argument that is usually set to private->posBytes.
- However, you can also use a constant for posbytes, which
- lets the compiler choose the right case at compile time,
- producing smaller and faster code.
- */
-
- #define POS_2_BYTES 1 /* word position fits in 2 bytes */
- #define POS_3_BYTES 0 /* fits in 3 bytes; usual case */
- #define POS_4_BYTES 2 /* fits in 4 bytes */
-
- #define GET_POS(pos,occur,index,posbytes) \
- { \
- if ( (posbytes) == POS_3_BYTES ) \
- pos = ((long)(occur)->p.pos3.hi[index] << 16) \
- + (occur)->p.pos3.lo[index]; \
- else if ( (posbytes) == POS_2_BYTES ) \
- pos = (occur)->p.pos2[index]; \
- else \
- pos = (occur)->p.pos4[index]; \
- }
-
- #define SET_POS(pos,occur,index,posbytes) \
- { \
- if ( (posbytes) == POS_3_BYTES ) \
- { \
- (occur)->p.pos3.hi[index] = (pos) >> 16; \
- (occur)->p.pos3.lo[index] = (pos); \
- } \
- else if ( (posbytes) == POS_2_BYTES ) \
- (occur)->p.pos2[index] = pos; \
- else \
- (occur)->p.pos4[index] = pos; \
- }
-
- /* ------------------------------------------------------ */
- /* Prototypes */
-
- void InitFind (
- char *textToSearch,
- long textLength,
- void *privateStorage,
- long storageSize
- );
-
- long FindWordOccurrence (
- char *wordToFind,
- long wordLength,
- long occurrenceToFind,
- char *textToSearch,
- long textLength,
- void *privateStorage,
- long storageSize
- );
-
- static long InitFindBody (
- uchar *textToSearch,
- long textLength,
- Private *private,
- uchar *endPrivateStorage
- );
-
- static Word *LookupWord (
- Private *private,
- char *textToSearch,
- char *wordText,
- long wordLength,
- ulong hash
- );
-
- static char PickNullChar (
- Private *private,
- uchar *textStart, /* start of unparsed text */
- uchar *textEnd /* end of all text */
- );
-
- static char *BMH_Search(
- ulong *hashCodes, /* private->hashCodes */
- char *wordToFind,
- long wordLength,
- long occurrenceToFind, /* 0 is 1st occurrence */
- char *textStart, /* start of unparsed text */
- char *textEnd, /* end of all text */
- char nullChar /* private->nullChar */
- );
-
- static char *SimpleSearch(
- ulong *hashCodes, /* private->hashCodes */
- char *wordToFind,
- long wordLength, /* 1..3 */
- long occurrenceToFind, /* 0 is 1st occurrence */
- char *textStart, /* start of unparsed text */
- char *textEnd /* end of all text */
- );
-
- /* ------------------------------------------------------ */
- /* InitFind */
-
- void InitFind (
- char *textToSearch,
- long textLength,
- void *privateStorage,
- long storageSize
- )
- {
- Private *private = privateStorage;
-
- private->endParsedText =
- InitFindBody(
- (uchar *)textToSearch,
- textLength,
- privateStorage,
- (uchar *)privateStorage + storageSize
- );
-
- if ( private->endParsedText != -1 )
- private->nullChar =
- PickNullChar(
- private,
- (uchar *)textToSearch + private->endParsedText,
- (uchar *)textToSearch + textLength );
- else
- private->nullChar = NO_NULL_CHAR;
- }
-
- /* ------------------------------------------------------ */
- /* InitFindBody */
-
- /*
- This function does most of the work for InitFind().
- The arguments have been recast into a more useful form;
- uchar and ulong are used a lot so that we don't have to
- worry about the sign, especially when indexing
- private->hashCodes[].
-
- The return value is the character position when the
- unparsed text begins (if we run out of private storage),
- or -1 if all the text was parsed.
- */
-
- static long InitFindBody (
- uchar *textToSearch,
- long textLength,
- Private *private,
- uchar *endPrivateStorage
- )
- {
- uchar *alloc, *textPos, *textEnd, *wordStart;
- long wordLength;
- ulong hash, code;
- Word *word;
- Occurrence *occur;
-
- /*
- Init table of hash codes. The remaining entries are
- guaranteed to be initialized to zero. The hash codes
- were chosen so that any two codes differ by at least
- five bits.
- */
-
- {
- ulong *table = private->hashCodes; /* reduces typing */
-
- table['0'] = 0xFFC0; table['5'] = 0xF492;
- table['1'] = 0xFE07; table['6'] = 0xF31E;
- table['2'] = 0xF98B; table['7'] = 0xF2D9;
- table['3'] = 0xF84C; table['8'] = 0xCF96;
- table['4'] = 0xF555; table['9'] = 0xCE51;
-
- table['A'] = 0xC9DD; table['N'] = 0xA245;
- table['B'] = 0xC81A; table['O'] = 0x9F0A;
- table['C'] = 0xC503; table['P'] = 0x9ECD;
- table['D'] = 0xC4C4; table['Q'] = 0x9941;
- table['E'] = 0xC348; table['R'] = 0x9886;
- table['F'] = 0xC28F; table['S'] = 0x959F;
- table['G'] = 0xAF5C; table['T'] = 0x9458;
- table['H'] = 0xAE9B; table['U'] = 0x93D4;
- table['I'] = 0xA917; table['V'] = 0x9213;
- table['J'] = 0xA8D0; table['W'] = 0x6DD3;
- table['K'] = 0xA5C9; table['X'] = 0x6C14;
- table['L'] = 0xA40E; table['Y'] = 0x6B98;
- table['M'] = 0xA382; table['Z'] = 0x6A5F;
-
- table['a'] = 0x6746; table['n'] = 0x3C88;
- table['b'] = 0x6681; table['o'] = 0x3B04;
- table['c'] = 0x610D; table['p'] = 0x3AC3;
- table['d'] = 0x60CA; table['q'] = 0x37DA;
- table['e'] = 0x5D85; table['r'] = 0x361D;
- table['f'] = 0x5C42; table['s'] = 0x3191;
- table['g'] = 0x5BCE; table['t'] = 0x3056;
- table['h'] = 0x5A09; table['u'] = 0x0D19;
- table['i'] = 0x5710; table['v'] = 0x0CDE;
- table['j'] = 0x56D7; table['w'] = 0x0B52;
- table['k'] = 0x515B; table['x'] = 0x0A95;
- table['l'] = 0x509C; table['y'] = 0x078C;
- table['m'] = 0x3D4F; table['z'] = 0x064B;
- }
-
- /*
- Determine the number of bytes needed to store
- each occurrence position.
- */
-
- if ( textLength <= 0x10000L )
- private->posBytes = POS_2_BYTES;
- else if ( textLength <= 0x1000000L )
- private->posBytes = POS_3_BYTES;
- else
- private->posBytes = POS_4_BYTES;
-
- /*
- Set up variables to handle allocation of
- private storage.
- */
-
- alloc = (uchar *)&private->heap;
-
- /* Parse the text */
-
- textPos = textToSearch;
- textEnd = textPos + textLength;
-
- while ( textPos != textEnd )
- {
- /* Search for start of word */
- while ( private->hashCodes[*textPos] == 0 )
- {
- textPos++;
- if ( textPos == textEnd )
- return -1; /* parse all text */
- }
- wordStart = textPos;
-
- /* Search for end of word; generate hash value too */
- hash = 0;
- while ( textPos != textEnd &&
- (code = private->hashCodes[ *textPos ]) != 0 )
- {
- hash = (hash << 1) ^ code;
- textPos++;
- }
- wordLength = textPos - wordStart;
- hash = (hash & 0xFFFFFF) | (wordLength << 24);
-
- /*
- Record the occurrence. First we see if a Word struct
- exists for this word and whether we need to allocate
- a new Occurrence struct.
- */
-
- word = LookupWord(
- private,
- (char *)textToSearch,
- (char *)wordStart,
- wordLength,
- hash );
- if ( word )
- {
- long allocateNewBlock, blockSize, i, pos;
-
- /*
- This word has occurred before, so it already has
- a Word struct. See if there's room in the last
- Occurrence block for another entry. Remember that
- entry #0 in the Occurrence block is always in use,
- so we can start checking at entry #1 for a non-zero
- entry.
- */
-
- occur = word->last;
- allocateNewBlock = TRUE;
- switch ( private->posBytes )
- {
- case POS_2_BYTES: blockSize = 6; break;
- case POS_3_BYTES: blockSize = 4; break;
- case POS_4_BYTES: blockSize = 3; break;
- }
-
- for ( i = 1; i < blockSize; i++ )
- {
- GET_POS( pos, occur, i, private->posBytes )
- if ( pos == 0 )
- {
- SET_POS( wordStart - textToSearch, occur, i,
- private->posBytes )
- allocateNewBlock = FALSE;
- break;
- }
- }
-
- if ( allocateNewBlock )
- {
- /* Block is full. Allocate new Occurrence block */
- occur = (Occurrence *) alloc;
- alloc += ALLOC_SIZE( sizeof(Occurrence) );
- if ( alloc >= endPrivateStorage )
- return wordStart-textToSearch; /* out of memory */
-
- /*
- Init the new struct and link it to the end
- of the occurence list.
- */
- SET_POS( wordStart - textToSearch, occur, 0,
- private->posBytes )
- word->last->next = occur;
- word->last = occur;
- }
- }
- else
- {
- long i;
-
- /*
- This is a new word. Allocate a new Word struct,
- which contains an Occurrence struct too.
- */
- word = (Word *) alloc;
- alloc += ALLOC_SIZE( sizeof(Word) );
- if ( alloc >= endPrivateStorage )
- return wordStart-textToSearch ; /* out of memory */
-
- /*
- Link it to the start of the Word list, coming off
- the hash table.
- */
- word->next = private->hashTable[ hash & HASH_MASK ];
- private->hashTable[ hash & HASH_MASK ] = word;
-
- /* Init the Word struct */
- word->hash = hash;
- word->last = &word->first;
-
- /* Init the Occurrence struct */
- SET_POS( wordStart - textToSearch, &word->first, 0,
- private->posBytes )
- }
- }
-
- /* Finished parsing text */
- return -1;
- }
-
- /* ------------------------------------------------------ */
- /* FindWordOccurrence */
-
- long FindWordOccurrence (
- char *wordToFind,
- long wordLength,
- long occurrenceToFind,
- char *textToSearch,
- long textLength,
- void *privateStorage,
- long storageSize
- )
- {
- Private *private = privateStorage;
- Word *word;
- ulong hash;
-
- /* Make occurenceToFind zero-based */
- occurrenceToFind--;
-
- /* Generate hash value for word to find */
- hash = 0;
- {
- long remain = wordLength;
- uchar *p = (uchar *) wordToFind;
- while ( remain > 0 )
- {
- hash = (hash << 1) ^ private->hashCodes[*p++];
- remain--;
- }
- hash = (hash & 0xFFFFFF) | (wordLength << 24);
- }
-
- /* Look for word/occurrence in hash table */
- word = LookupWord( private, textToSearch, wordToFind,
- wordLength, hash );
- if ( word )
- {
- Occurrence *occur = &word->first;
- long blockSize, pos, i;
-
- /*
- Word exists in hash table, so go down the
- occurrence list.
- */
-
- switch ( private->posBytes )
- {
- case POS_2_BYTES: blockSize = 6; break;
- case POS_3_BYTES: blockSize = 4; break;
- case POS_4_BYTES: blockSize = 3; break;
- }
-
- while ( occur && occurrenceToFind >= blockSize )
- {
- occurrenceToFind -= blockSize;
- occur = occur->next;
- }
-
- if ( occur )
- {
- GET_POS( pos, occur, occurrenceToFind,
- private->posBytes )
- if ( occurrenceToFind == 0 || pos != 0 )
- return pos;
- occurrenceToFind -= blockSize;
- }
-
- occur = word->last;
- for ( i = 0; i < blockSize; i++ )
- {
- GET_POS( pos, occur, i, private->posBytes )
- if ( pos == 0 )
- occurrenceToFind++;
- }
- }
-
- /* Not in parsed text, so check the unparsed text */
- if ( private->endParsedText != -1 )
- {
- char *p;
- if ( wordLength > 3 )
- p = BMH_Search(
- private->hashCodes,
- wordToFind,
- wordLength,
- occurrenceToFind,
- textToSearch + private->endParsedText,
- textToSearch + textLength,
- private->nullChar );
- else
- p = SimpleSearch(
- private->hashCodes,
- wordToFind,
- wordLength,
- occurrenceToFind,
- textToSearch + private->endParsedText,
- textToSearch + textLength );
- if (p)
- return (p - textToSearch);
- }
-
- /* Not found */
- return -1;
- }
-
- /* ------------------------------------------------------ */
- /* LookupWord */
-
- /* Look up a word in the hash table */
-
- static Word *LookupWord (
- Private *private,
- char *textToSearch,
- char *wordText,
- long wordLength,
- ulong hash
- )
- {
- Word *word = private->hashTable[ hash & HASH_MASK ];
- while ( word )
- {
- if ( word->hash == hash )
- {
- char *w1, *w2;
- long pos, remain = wordLength;
-
- /*
- The hash values match, so compare characters to make
- sure it's the right word. We already know the word
- length is correct since the length is contained
- in the upper eight bits of the hash value.
- */
- GET_POS( pos, &word->first, 0, private->posBytes )
- w1 = textToSearch + pos;
- w2 = wordText;
- while ( remain-- > 0 && *w1++ == *w2++ )
- ;
- if ( remain == -1 )
- return word;
- }
- word = word->next;
- }
- return NULL;
- }
-
- /* ------------------------------------------------------ */
- /* PickNullChar */
-
- /*
- Find a character that doesn't appear anywhere in
- the unparsed text. BMH_Search() is faster if such
- a character can be found.
- */
-
- static char PickNullChar (
- Private *private,
- uchar *textStart,
- uchar *textEnd
- )
- {
- long i;
- uchar *p, occurs[ ALPHABET_SIZE ];
-
- for ( i = 0; i < ALPHABET_SIZE; i++ )
- occurs[i] = FALSE;
-
- for ( p = textStart; p < textEnd; p++ )
- occurs[*p] = TRUE;
-
- for ( i = 0; i < ALPHABET_SIZE; i++ )
- if ( occurs[i] == FALSE && private->hashCodes[i] == 0 )
- return i;
-
- return NO_NULL_CHAR;
- }
-
- /* ------------------------------------------------------ */
- /* BMH_Search */
-
- /*
- Search the unparsed text using the Boyer-Moore-Horspool
- algorithm. Ideally a null character is supplied (one that
- appears in neither the search string nor the text being
- searched). This allows the inner loop to be faster.
- */
-
- static char *BMH_Search (
- ulong *hashCodes, /* private->hashCodes */
- char *wordToFind,
- long wordLength,
- long occurrenceToFind, /* 0 is first occurrence */
- char *textStart, /* start of unparsed text */
- char *textEnd, /* end of unparsed text */
- char nullChar /* private->nullChar */
- )
- {
- long i;
- char *text, *wordEnd;
- char word[256];
- long offset[ ALPHABET_SIZE ];
-
- /*
- Copy the search string to a private buffer, where
- the first character is the null character.
- */
-
- word[0] = nullChar;
- for ( i = 0; i < wordLength; i++ )
- word[i+1] = wordToFind[i];
-
- /* Set up the offset[] lookup table */
-
- for ( i = 0; i < ALPHABET_SIZE; i++ )
- offset[i] = wordLength;
-
- for ( i = 1; i < wordLength; i++ )
- offset[ word[i] ] = wordLength - i;
-
- /* Let the search begin... */
-
- wordEnd = word + wordLength;
- text = textStart + wordLength - 1;
-
- if ( nullChar == NO_NULL_CHAR )
- {
- /* No null character, so use a slower inner loop */
- while ( text < textEnd )
- {
- long i;
- char *p, *q;
- for ( i = wordLength, p = wordEnd, q = text;
- i > 0 && *p == *q;
- i--, p--, q-- )
- ;
-
- /*
- If i == 0, we have found the search string.
- Now we make sure that it is delimited.
- */
- if ( i == 0 && hashCodes[*q] == 0 &&
- (text+1 == textEnd || hashCodes[text[1]] == 0) )
- {
- if ( occurrenceToFind == 0 )
- return q+1;
- occurrenceToFind--;
- }
-
- text += offset[*text];
- }
- }
- else
- {
- /*
- There is a null character (usual case), so
- we can use a faster and simpler inner loop.
- */
- while ( text < textEnd )
- {
- char *p, *q;
- for ( p = wordEnd, q = text; *p == *q; p--, q-- )
- ;
- if ( p == word && hashCodes[*q] == 0 &&
- (text+1 == textEnd || hashCodes[text[1]] == 0) )
- {
- if ( occurrenceToFind == 0 )
- return q+1;
- occurrenceToFind--;
- }
- text += offset[*text];
- }
- }
- return NULL;
- }
-
- /* ------------------------------------------------------ */
- /* SimpleSearch */
-
- /*
- Search the unparsed text using a simple search algorithm.
- Note that wordLength must be 1, 2, or 3. This algorithm
- runs faster than BMH_Search() for small search strings.
- */
-
- static char *SimpleSearch(
- ulong *hashCodes, /* private->hashCodes */
- char *wordToFind,
- long wordLength, /* 1..3 */
- long occurrenceToFind, /* 0 is 1st occurrence */
- char *textStart, /* start of unparsed text */
- char *textEnd /* end of all text */
- )
- {
- char *text, first;
-
- first = wordToFind[0];
- text = textStart;
-
- if ( wordLength == 1 )
- {
- while ( text < textEnd )
- {
- while ( text < textEnd && *text != first )
- text++;
- if ( hashCodes[*(text-1)] == 0 &&
- hashCodes[text[wordLength]] == 0 )
- {
- if ( occurrenceToFind == 0 )
- return text;
- occurrenceToFind--;
- }
- text++;
- }
- }
- else if ( wordLength == 2 )
- {
- while ( text < textEnd )
- {
- while ( text < textEnd && *text != first )
- text++;
- if ( text[1] == wordToFind[1] &&
- hashCodes[*(text-1)] == 0 &&
- hashCodes[text[wordLength]] == 0 )
- {
- if ( occurrenceToFind == 0 )
- return text;
- occurrenceToFind--;
- }
- text++;
- }
- }
- else /* wordLength == 3 */
- {
- while ( text < textEnd )
- {
- while ( text < textEnd && *text != first )
- text++;
- if ( text[1] == wordToFind[1] &&
- text[2] == wordToFind[2] &&
- hashCodes[*(text-1)] == 0 &&
- hashCodes[text[wordLength]] == 0 )
- {
- if ( occurrenceToFind == 0 )
- return text;
- occurrenceToFind--;
- }
- text++;
- }
- }
- return NULL;
- }
-
-
-
-